# import relevant modules
import pandas as pd
pd.set_option('display.max_columns', None)
import glob
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import cluster
from sklearn.manifold import TSNE
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
sys.path.append('../scripts/')
from querysuggestion import concat_suggestions, vectorize_suggestions
from clustering import kmeans_suggestions, dbscan_suggestions
# set to *.csv to process all
#path_to_csv = '../../data/BTW17_Suggestions/BTW_COMPLETE/*.csv'
#file_list = glob.glob(path_to_csv)
#start = '2017-05-29'
#end = '2017-10-09'
#suggestions_df = concat_suggestions(file_list, start, end)
#print(f'daterange: {suggestions_df["date"].min()}, {suggestions_df["date"].max()}')
# save to parquet
#suggestions_df.to_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
# Load the preprocessed suggestions from the cached parquet file.
suggestions_df = pd.read_parquet(
    '../../data/BTW17_Suggestions/processed/suggestions.parquet'
)
# Tokenize: cast each suggestion to str and split it on single spaces.
suggestions_df['tokens'] = suggestions_df['suggestion'].map(
    lambda text: str(text).split(' ')
)
suggestions_df.head(3)
# vectorize_suggestions is a project helper; it yields the suggestions and
# their vectors (presumably index-aligned -- TODO confirm).
suggestions, vector_data = vectorize_suggestions(suggestions_df)
# Retrieve unique suggestions and their vectors.
#
# Of several identical suggestions only the LAST occurrence is kept, empty
# suggestions are discarded, and so is every entry whose vector contains NaN.
# One boolean mask is applied to both sequences so that suggestions[i] always
# belongs to vector_data[i] afterwards.
vector_data = np.asarray(vector_data)  # boolean row selection needs an ndarray
if len(vector_data) != len(suggestions):
    # Presumably vectors were only produced for non-empty suggestions; drop
    # the empty ones so that positions line up (TODO confirm against
    # vectorize_suggestions).
    suggestions = [s for s in suggestions if s]
if len(vector_data) != len(suggestions):
    raise ValueError(
        f'cannot align {len(suggestions)} suggestions '
        f'with {len(vector_data)} vectors'
    )
# Later duplicates overwrite earlier ones, so this maps every suggestion to
# the index of its last occurrence in a single O(n) pass.
last_seen = {s: i for i, s in enumerate(suggestions)}
keep = np.array(
    [bool(s) and last_seen[s] == i for i, s in enumerate(suggestions)],
    dtype=bool,
)
# Rows with missing components cannot be embedded; drop them together with
# their suggestion instead of from the vectors alone.
keep &= ~np.isnan(vector_data).any(axis=1)
suggestions = [s for s, kept in zip(suggestions, keep) if kept]
vector_data = vector_data[keep]
# Project the suggestion vectors onto two dimensions with t-SNE; the fixed
# random_state is passed so the estimator is seeded identically on every run.
tsne = TSNE(
    n_components=2,
    random_state=1410,
)
X_tsne = tsne.fit_transform(vector_data)
# NOTE: the two `%` lines are IPython magics, not plain Python -- this part of
# the file only runs inside IPython/Jupyter.
# They switch on the autoreload extension (mode 2), after which the helper is
# imported again, presumably so that edits to clustering.py are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2
from clustering import dbscan_suggestions
# Evaluate DBSCAN on the 2-D embedding via the project helper and collect the
# results in a frame (presumably one row per eps/min_samples combination --
# TODO confirm against clustering.dbscan_suggestions).
dbscan_scores = pd.DataFrame(data=dbscan_suggestions(X_tsne))
# German display names used for all following tables and plots.
display_names = {
    'eps': 'Maximale Distanz',
    'min_samples': 'Minimale Anzahl Punkte pro Cluster',
    'silhouette_score': 'Silhouette Score',
    'num_cluster': 'Anzahl Cluster',
    'num_noise': 'Anzahl Rauschpunkte',
}
dbscan_scores = dbscan_scores.rename(columns=display_names)

# Pivot every metric into an (eps x min_samples) grid of mean values.
eps_values = dbscan_scores['Maximale Distanz']
min_pts_values = dbscan_scores['Minimale Anzahl Punkte pro Cluster']
silhouette_df, nnoise_df, ncluster_df = (
    pd.crosstab(
        index=eps_values,
        columns=min_pts_values,
        values=dbscan_scores[metric],
        aggfunc='mean',
    )
    for metric in ('Silhouette Score', 'Anzahl Rauschpunkte', 'Anzahl Cluster')
)
# Three side-by-side heatmaps (silhouette score, noise points, cluster count)
# over the eps / min_samples grid.
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=('Silhouette Score', 'Anzahl Rauschpunkte', 'Anzahl Cluster'),
    shared_yaxes=True, horizontal_spacing=0.15,
)
# (grid, colour scale, colour-bar x position, trace name) per subplot.
heatmap_specs = (
    (silhouette_df, px.colors.sequential.RdBu, 0.233, 'Silhouette Scores'),
    (nnoise_df, px.colors.sequential.RdBu_r, 0.618, 'Anzahl Rauschpunkte'),
    (ncluster_df, px.colors.sequential.RdBu_r, 1, 'Anzahl Cluster'),
)
for col, (grid, colorscale, colorbar_x, trace_name) in enumerate(heatmap_specs, start=1):
    # Take the axis labels from the grid itself: pd.crosstab sorts its rows
    # and columns, whereas Series.unique() keeps order of appearance, so
    # labels taken from unique() can end up on the wrong cells.
    fig.add_trace(
        go.Heatmap(z=grid, x=grid.columns, y=grid.index,
                   colorscale=colorscale, colorbar_x=colorbar_x, name=trace_name),
        row=1, col=col,
    )
    fig.update_xaxes(title='Mindestanzahl Punkte pro Cluster', row=1, col=col)
fig.update_traces(hovertemplate='%{z}')
fig.update_annotations(font_size=18)
# The y axis is shared, so only the left-most subplot gets a title.
fig.update_yaxes(title='Epsilon', row=1, col=1)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
# Method 1: keep parameter sets with fewer than 30% noise points, then order
# them by number of clusters (ascending).
noise_limit = len(X_tsne) * 0.3
dbscan_scores.loc[dbscan_scores['Anzahl Rauschpunkte'] < noise_limit].sort_values(by='Anzahl Cluster')
# Method 2: keep parameter sets with fewer than 200 clusters, then order them
# by number of noise points (ascending).
dbscan_scores.loc[dbscan_scores['Anzahl Cluster'] < 200].sort_values(by='Anzahl Rauschpunkte')
from sklearn import cluster, metrics

# Final DBSCAN run on the 2-D embedding with the chosen parameters.
dbscan = cluster.DBSCAN(eps=0.95, min_samples=7).fit(X_tsne)
labels = dbscan.labels_

# Points labelled -1 are left out of the silhouette computation.
is_clustered = labels != -1
labels_clean = labels[is_clustered].tolist()
vectors_clean = np.array(X_tsne[is_clustered].tolist())

# Number of distinct labels, not counting the -1 label.
n_clusters = len(set(labels) - {-1})
n_noise = int((labels == -1).sum())

silhouette = metrics.silhouette_score(vectors_clean, labels_clean)
print(f'Silhouette Score w/o noise points: {silhouette}')
print(f'Estimated number of clusters: {n_clusters}')
print(f'Estimated number of noise points: {n_noise}')
print(f'Noise in percent: {n_noise/len(labels)*100}%')
# Create the output frame: one row per suggestion with its 2-D coordinates,
# its cluster label and the coordinate pair as a single 'vector' value.
output_df = pd.DataFrame(X_tsne, columns=['t-SNE(x)', 't-SNE(y)'])
output_df['suggestion'] = suggestions
output_df['cluster'] = labels
# Attach the vectors BEFORE sorting. Assigning them after
# sort_values(..., ignore_index=True) would place the unsorted rows of X_tsne
# next to the sorted suggestions and labels.
output_df['vector'] = list(X_tsne)
# Sort while the labels are still numeric, then convert them to strings
# (the plots below compare the label against the string '-1').
output_df.sort_values(by='cluster', inplace=True, ignore_index=True)
output_df['cluster'] = output_df['cluster'].apply(str)
# Persist the clustering result as JSON (written before the columns are
# renamed, so the file keeps the lower-case column names).
output_df.to_json('../../data/BTW17_Suggestions/suggestions/cluster.json')

# Capitalised column names are used for the legend / hover labels.
output_df = output_df.rename(columns={'cluster': 'Cluster', 'suggestion': 'Suggestion'})

# Scatter plot of the embedding, one colour per cluster label.
fig = px.scatter(
    output_df,
    x='t-SNE(x)',
    y='t-SNE(y)',
    color='Cluster',
    hover_name='Suggestion',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
scatter_font = dict(family='Computer Modern', color='black', size=15)
fig.update_layout(font=scatter_font)
fig.show()
# Count how many suggestions fall into each cluster label.
cluster_sizes = output_df['Cluster'].value_counts()
tmp = pd.DataFrame({
    'Cluster': cluster_sizes.index,
    'Clustergröße': cluster_sizes.values,
})
# Box plot of the cluster sizes, leaving out the '-1' label.
real_clusters = tmp[tmp['Cluster'] != '-1']
fig = px.box(
    real_clusters,
    y='Clustergröße',
    points='all',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()